# Import Libraries
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering
import random
from sklearn.metrics import silhouette_score
random.seed(42)
# warnings ignored
import warnings
warnings.filterwarnings("ignore")
import plotly
from plotly import __version__
print(__version__)
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.offline as pyoff
from plotly.offline import init_notebook_mode , plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected = True)
import plotly.figure_factory as ff
import datetime as dt
!pip install chart-studio
import chart_studio.plotly as py
4.4.1
Requirement already satisfied: chart-studio in /usr/local/lib/python3.7/dist-packages (1.1.0) Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from chart-studio) (2.23.0) Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from chart-studio) (1.15.0) Requirement already satisfied: retrying>=1.3.3 in /usr/local/lib/python3.7/dist-packages (from chart-studio) (1.3.3) Requirement already satisfied: plotly in /usr/local/lib/python3.7/dist-packages (from chart-studio) (4.4.1) Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->chart-studio) (2.10) Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->chart-studio) (1.24.3) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->chart-studio) (2020.12.5) Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->chart-studio) (3.0.4)
Helper function for rendering Plotly figures inline in Google Colab
def configure_plotly_browser_state():
    """Configure a Colab output cell so Plotly figures render inline.

    Injects a require.js path mapping that loads plotly.js from the CDN.
    Colab resets the JS state per output cell, so this must be called in
    every cell that draws a figure with iplot()/pyoff.iplot().
    """
    import IPython
    display(IPython.core.display.HTML('''
<script src="/static/components/requirejs/require.js"></script>
<script>
requirejs.config({
paths: {
base: '/static/base',
plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
},
});
</script>
'''))
# Load the cleaned/normalized Online Retail dataset from the mounted Google Drive.
retail = pd.read_excel("/content/drive/MyDrive/project-4/Notebooks and Data/data/online_retail_normalized.xlsx")
# Fixed random_state so the displayed sample is reproducible across runs.
retail.sample(5, random_state = 42)
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | CancelledQuantity | TotalPrice | |
|---|---|---|---|---|---|---|---|---|---|---|
| 85446 | 547656 | 21166 | COOK WITH WINE METAL SIGN | 3 | 2011-03-24 12:53:00 | 2.08 | 13263 | United Kingdom | 0 | 6.24 |
| 164784 | 557960 | 22170 | PICTURE FRAME WOOD TRIPLE PORTRAIT | 1 | 2011-06-23 19:46:00 | 6.75 | 15453 | United Kingdom | 0 | 6.75 |
| 134906 | 554079 | 84978 | HANGING HEART JAR T-LIGHT HOLDER | 12 | 2011-05-22 11:41:00 | 1.25 | 14546 | United Kingdom | 0 | 15.00 |
| 197728 | 562128 | 22720 | SET OF 3 CAKE TINS PANTRY DESIGN | 3 | 2011-08-03 09:07:00 | 4.95 | 16150 | United Kingdom | 0 | 14.85 |
| 42314 | 542231 | 23230 | WRAP ALPHABET DESIGN | 25 | 2011-01-26 13:40:00 | 0.42 | 16714 | United Kingdom | 0 | 10.50 |
# Per-column missing-value counts, shaped into a two-column summary table
# with columns ['index', 'missing'].
missing_counts = retail.isnull().sum()
missing_df = missing_counts.to_frame(name='missing').reset_index()
missing_df
| index | missing | |
|---|---|---|
| 0 | InvoiceNo | 0 |
| 1 | StockCode | 0 |
| 2 | Description | 0 |
| 3 | Quantity | 0 |
| 4 | InvoiceDate | 0 |
| 5 | UnitPrice | 0 |
| 6 | CustomerID | 0 |
| 7 | Country | 0 |
| 8 | CancelledQuantity | 0 |
| 9 | TotalPrice | 0 |
# Column names of the raw dataset (numpy array of object dtype).
retail.columns.values
array(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
'UnitPrice', 'CustomerID', 'Country', 'CancelledQuantity',
'TotalPrice'], dtype=object)
# Work from a copy without the derived columns; revenue is recomputed later.
uk_retail = retail.drop(columns=['CancelledQuantity', 'TotalPrice'])
uk_retail.shape  # confirm the two columns were dropped
(390453, 8)
United Kingdom
# Parse the invoice timestamp, then restrict the analysis to UK transactions.
uk_retail['InvoiceDate'] = pd.to_datetime(uk_retail['InvoiceDate'])
uk = uk_retail.loc[uk_retail['Country'] == 'United Kingdom'].reset_index(drop=True)
RFM stands for Recency, Frequency and Monetary.
Recency
To calculate recency, we find each customer's most recent purchase date and count how many days they have been inactive since then. We will then apply K-means clustering to assign each customer a recency score.
# One row per customer; the RFM scores are merged into this frame as they
# are computed.
uk_user = uk_retail[['CustomerID']].drop_duplicates().reset_index(drop=True)
# Most recent purchase per customer; Recency = days since the dataset's
# overall latest purchase date (0 = bought on the last recorded day).
uk_max_purchase = uk.groupby('CustomerID')['InvoiceDate'].max().reset_index()
uk_max_purchase.columns = ['CustomerID', 'MaxPurchaseDate']
uk_max_purchase['Recency'] = (uk_max_purchase['MaxPurchaseDate'].max()
                              - uk_max_purchase['MaxPurchaseDate']).dt.days
uk_user = uk_user.merge(uk_max_purchase[['CustomerID', 'Recency']], on='CustomerID')
uk_user.head()
| CustomerID | Recency | |
|---|---|---|
| 0 | 17850 | 371 |
| 1 | 13047 | 56 |
| 2 | 13748 | 95 |
| 3 | 15100 | 333 |
| 4 | 15291 | 25 |
Frequency
Find the total number of orders for each customer.
# Frequency = number of invoice lines per customer (proxy for purchase count).
uk_frequency = uk.groupby('CustomerID')['InvoiceDate'].count().reset_index()
uk_frequency = uk_frequency.rename(columns={'InvoiceDate': 'Frequency'})
uk_frequency.head(3)
| CustomerID | Frequency | |
|---|---|---|
| 0 | 12747 | 103 |
| 1 | 12748 | 4378 |
| 2 | 12749 | 199 |
# Attach the frequency feature to the per-customer table.
uk_user = uk_user.merge(uk_frequency, on='CustomerID')
uk_user['Frequency'].describe()
count 3913.000000 mean 88.778942 std 213.411626 min 1.000000 25% 16.000000 50% 40.000000 75% 97.000000 max 7649.000000 Name: Frequency, dtype: float64
Revenue
# Monetary value: revenue per line item, then total revenue per customer.
uk['Revenue'] = uk['UnitPrice'] * uk['Quantity']
uk_revenue = uk.groupby('CustomerID')['Revenue'].sum().reset_index()
# Merge it with the main per-customer dataframe.
uk_user = uk_user.merge(uk_revenue, on='CustomerID')
uk_user.sample(3)
| CustomerID | Recency | Frequency | Revenue | |
|---|---|---|---|---|
| 2733 | 17405 | 0 | 82 | 1031.41 |
| 3910 | 15520 | 1 | 18 | 343.50 |
| 898 | 16850 | 333 | 19 | 332.41 |
Find the Recency, Frequency and Revenue clusters
# Elbow method: fit K-means for k = 2..9 on the Recency feature and record
# the inertia (within-cluster sum of squared distances) for each k.
from sklearn.cluster import KMeans
inertia = {}
uk_recency = uk_user[['Recency']]
for k in range(2, 10):
    # Fix: the original appended the fitted labels to uk_recency inside this
    # loop, so every fit after the first ran on TWO columns (Recency plus the
    # previous iteration's cluster labels) and the stray column leaked into
    # all later uses of uk_recency. Fit on the Recency column only.
    # random_state pins the centroid initialisation for reproducible curves.
    kmeans = KMeans(n_clusters=k, max_iter=500, random_state=42).fit(uk_recency)
    inertia[k] = kmeans.inertia_
plt.figure()
plt.plot(list(inertia.keys()), list(inertia.values()), linestyle='dashdot',
         marker='*', markersize=10, markeredgecolor='m')
plt.xlabel("Number of cluster")
plt.ylabel("Inertia")
plt.title("Elbow Plot", weight='bold', size=15)
plt.show()
# Silhouette analysis over candidate cluster counts k = 2..12 on Recency.
score_list = []
list1 = list(range(2, 13))
for n_clusters in list1:
    labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(uk_recency)
    score_list.append(silhouette_score(uk_recency, labels))
print(score_list)
[0.7390390612208076, 0.6758265781129741, 0.6149118594385379, 0.61579975358264, 0.6056212223851841, 0.5717851353946662, 0.5762341755093784, 0.5837078278186719, 0.566991463287418, 0.5659005238003059, 0.5630275208900551]
# Plot the average silhouette score against the number of clusters.
silhouette_df = pd.DataFrame({'silhouette_score': score_list}, index=list1)
plt.figure(figsize=(8, 4))
sns.set_theme(style='darkgrid', palette='cool')
ax = sns.lineplot(x=list1, y='silhouette_score', data=silhouette_df,
                  linestyle='--', color='blue')
ax.set(xlabel='Number of Clusters')
plt.show()
# Per-sample silhouette plots for k = 2..5, one panel per k in a 2x2 grid.
from yellowbrick.cluster import SilhouetteVisualizer
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(2, 2, figsize=(20, 10))
for idx, n_clusters in enumerate([2, 3, 4, 5]):
    # Map the flat panel index onto (row, col) of the 2x2 axes grid.
    row, col = divmod(idx, 2)
    model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10,
                   max_iter=100, random_state=42)
    visualizer = SilhouetteVisualizer(model, colors='yellowbrick', ax=ax[row][col])
    visualizer.fit(uk_recency)
Both the elbow method and the silhouette analysis suggest k = 4.
# Build 4 recency clusters (k chosen from elbow + silhouette) and attach them.
# Fix: the original had no random_state, so the cluster ids could differ
# between runs; 42 matches the seed used everywhere else in this file.
kmeans = KMeans(n_clusters=4, random_state=42)
uk_user['RecencyCluster'] = kmeans.fit_predict(uk_user[['Recency']])
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel cluster ids so they are ordered by each cluster's mean target value.

    K-means assigns arbitrary label numbers; this remaps them to 0..k-1 so
    that cluster 0 has the lowest (ascending=True) or highest
    (ascending=False) mean of `target_field_name`.

    Parameters
    ----------
    cluster_field_name : str
        Column holding the raw cluster labels.
    target_field_name : str
        Numeric column whose per-cluster mean defines the ordering.
    df : pandas.DataFrame
        Frame containing both columns.
    ascending : bool
        Sort direction for the per-cluster means.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with `cluster_field_name` replaced by the ordered ids.
    """
    # Fix: removed the unused local `new_cluster_field_name` from the original.
    # Mean of the target per raw cluster id, sorted into the requested order;
    # the sorted position (0..k-1) becomes the new cluster id.
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    # Map raw id -> ordered id and swap the column in place of the old one.
    df_final = pd.merge(df, df_new[[cluster_field_name, 'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name], axis=1)
    df_final = df_final.rename(columns={"index": cluster_field_name})
    return df_final
# Reorder recency cluster ids by mean Recency, descending: cluster 0 = longest
# inactive customers, cluster 3 = most recently active (low recency is good).
uk_user = order_cluster('RecencyCluster', 'Recency',uk_user,False)
# Per-cluster Recency statistics to sanity-check the ordering.
uk_user.groupby('RecencyCluster')['Recency'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| RecencyCluster | ||||||||
| 0 | 443.0 | 307.361174 | 39.687929 | 250.0 | 272.5 | 303.0 | 337.0 | 373.0 |
| 1 | 552.0 | 191.851449 | 31.558485 | 138.0 | 165.0 | 189.0 | 217.0 | 249.0 |
| 2 | 938.0 | 82.250533 | 24.248186 | 51.0 | 63.0 | 76.0 | 99.0 | 137.0 |
| 3 | 1980.0 | 19.070202 | 14.166755 | 0.0 | 7.0 | 17.0 | 30.0 | 50.0 |
# Frequency clusters: fit a fresh, seeded K-means. Fix: the original silently
# reused the estimator object created in the recency cell and had no
# random_state, so the labels were run-dependent and the cells were coupled.
kmeans = KMeans(n_clusters=4, random_state=42)
uk_user['FrequencyCluster'] = kmeans.fit_predict(uk_user[['Frequency']])
# Reorder ids so a higher cluster number means a more frequent buyer.
uk_user = order_cluster('FrequencyCluster', 'Frequency', uk_user, True)
uk_user.groupby('FrequencyCluster')['Frequency'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| FrequencyCluster | ||||||||
| 0 | 3470.0 | 48.396254 | 43.689040 | 1.0 | 15.00 | 33.0 | 71.00 | 185.0 |
| 1 | 418.0 | 321.684211 | 129.829535 | 186.0 | 221.00 | 279.0 | 389.25 | 785.0 |
| 2 | 22.0 | 1266.136364 | 483.737266 | 824.0 | 946.75 | 1093.5 | 1420.75 | 2650.0 |
| 3 | 3.0 | 5712.666667 | 1716.497111 | 4378.0 | 4744.50 | 5111.0 | 6380.00 | 7649.0 |
# Revenue clusters: fit a fresh, seeded K-means. Fix: the original silently
# reused the earlier estimator object and had no random_state, so the labels
# were run-dependent and the cells were coupled.
kmeans = KMeans(n_clusters=4, random_state=42)
uk_user['RevenueCluster'] = kmeans.fit_predict(uk_user[['Revenue']])
# Reorder ids so a higher cluster number means a higher-spending customer.
uk_user = order_cluster('RevenueCluster', 'Revenue', uk_user, True)
uk_user.groupby('RevenueCluster')['Revenue'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| RevenueCluster | ||||||||
| 0 | 3711.0 | 991.789672 | 1043.473435 | 0.00 | 280.9650 | 596.090 | 1335.8100 | 5320.63 |
| 1 | 179.0 | 9723.838603 | 5214.879651 | 5343.28 | 6286.3000 | 7792.360 | 10899.1550 | 28882.44 |
| 2 | 21.0 | 51036.268095 | 16018.001763 | 31499.18 | 36910.4900 | 51527.300 | 60044.4100 | 90704.38 |
| 3 | 2.0 | 224632.415000 | 49532.667388 | 189607.53 | 207119.9725 | 224632.415 | 242144.8575 | 259657.30 |
# Overall RFM score: sum of the three ordered cluster ids (0..8; higher is a
# more valuable customer, since each id was ordered best-last).
uk_user['OverallScore'] = (uk_user['RecencyCluster']
                           + uk_user['FrequencyCluster']
                           + uk_user['RevenueCluster'])
# Fix: multi-column selection on a groupby requires a list; the bare
# 'A','B','C' tuple form is deprecated and removed in modern pandas.
uk_user.groupby('OverallScore')[['Recency', 'Frequency', 'Revenue']].mean()
| Recency | Frequency | Revenue | |
|---|---|---|---|
| OverallScore | |||
| 0 | 307.295455 | 22.090909 | 343.166591 |
| 1 | 192.610294 | 31.226103 | 510.063108 |
| 2 | 83.733186 | 45.606395 | 894.562869 |
| 3 | 22.476099 | 66.712556 | 1172.725762 |
| 4 | 14.570492 | 273.727869 | 3889.258984 |
| 5 | 9.890756 | 404.663866 | 10782.278992 |
| 6 | 6.450000 | 917.800000 | 27310.926000 |
| 7 | 3.000000 | 1252.000000 | 118296.000000 |
| 8 | 1.333333 | 5712.666667 | 45657.486667 |
# Box plot of Frequency for each frequency cluster.
# Fix: the original passed the whole sub-DataFrame as `y`, which makes Plotly
# box every column; pass the Frequency column itself.
traces = []
for cluster_id in range(4):
    members = uk_user.loc[uk_user['FrequencyCluster'] == cluster_id]
    traces.append(go.Box(y=members['Frequency'], name=f'Cluster{cluster_id}'))
configure_plotly_browser_state()
init_notebook_mode(connected=False)
data = traces
layout = go.Layout(title = 'Frequency:Clusters')
fig = go.Figure(data = data, layout = layout)
# call Figure object
iplot(fig)
# Box plot of Recency for each recency cluster.
# Fix: the original passed the whole sub-DataFrame as `y`, which makes Plotly
# box every column; pass the Recency column itself.
traces = []
for cluster_id in range(4):
    members = uk_user.loc[uk_user['RecencyCluster'] == cluster_id]
    traces.append(go.Box(y=members['Recency'], name=f'Cluster{cluster_id}'))
configure_plotly_browser_state()
init_notebook_mode(connected=False)
data = traces
layout = go.Layout(title = 'Recency:Clusters')
fig = go.Figure(data = data, layout = layout)
# call Figure object
iplot(fig)
# Box plot of Revenue for each revenue cluster.
# Fix: the original passed the whole sub-DataFrame as `y`, which makes Plotly
# box every column; pass the Revenue column itself.
traces = []
for cluster_id in range(4):
    members = uk_user.loc[uk_user['RevenueCluster'] == cluster_id]
    traces.append(go.Box(y=members['Revenue'], name=f'Cluster{cluster_id}'))
configure_plotly_browser_state()
init_notebook_mode(connected=False)
data = traces
layout = go.Layout(title = 'Revenue:Clusters')
fig = go.Figure(data = data, layout = layout)
# call Figure object
iplot(fig)
# Overall RFM score (repeats the earlier scoring cell; kept for the notebook
# narrative — recomputing the same sum is idempotent).
uk_user['OverallScore'] = (uk_user['RecencyCluster']
                           + uk_user['FrequencyCluster']
                           + uk_user['RevenueCluster'])
# Fix: multi-column selection on a groupby requires a list; the bare
# 'A','B','C' tuple form is deprecated and removed in modern pandas.
uk_user.groupby('OverallScore')[['Recency', 'Frequency', 'Revenue']].mean()
| Recency | Frequency | Revenue | |
|---|---|---|---|
| OverallScore | |||
| 0 | 307.295455 | 22.090909 | 343.166591 |
| 1 | 192.610294 | 31.226103 | 510.063108 |
| 2 | 83.733186 | 45.606395 | 894.562869 |
| 3 | 22.476099 | 66.712556 | 1172.725762 |
| 4 | 14.570492 | 273.727869 | 3889.258984 |
| 5 | 9.890756 | 404.663866 | 10782.278992 |
| 6 | 6.450000 | 917.800000 | 27310.926000 |
| 7 | 3.000000 | 1252.000000 | 118296.000000 |
| 8 | 1.333333 | 5712.666667 | 45657.486667 |
# Map the composite score onto three named value segments:
# 0-2 -> Low-Value, 3-4 -> Mid-Value, 5+ -> High-Value.
conditions = [uk_user['OverallScore'] > 4, uk_user['OverallScore'] > 2]
choices = ['High-Value', 'Mid-Value']
uk_user['Segment'] = np.select(conditions, choices, default='Low-Value')
uk_user.sample(5, random_state= 42)
| CustomerID | Recency | Frequency | Revenue | RecencyCluster | FrequencyCluster | RevenueCluster | OverallScore | Segment | |
|---|---|---|---|---|---|---|---|---|---|
| 3263 | 18184 | 14 | 1 | 49.80 | 3 | 0 | 0 | 3 | Mid-Value |
| 3594 | 16226 | 202 | 8 | 255.12 | 1 | 0 | 0 | 1 | Low-Value |
| 1138 | 13038 | 79 | 28 | 908.46 | 2 | 0 | 0 | 2 | Low-Value |
| 1432 | 13075 | 129 | 22 | 919.61 | 2 | 0 | 0 | 2 | Low-Value |
| 3414 | 16034 | 195 | 22 | 437.70 | 1 | 0 | 0 | 1 | Low-Value |
# Revenue vs Frequency scatter, one trace per customer segment.
configure_plotly_browser_state()
# Trim the extreme outliers so the bulk of customers stays readable.
uk_graph = uk_user.query("Revenue < 50000 and Frequency < 2000")
segment_styles = [
    ('Low-Value', 'Low', dict(size=9, line=dict(width=1), color='red', opacity=0.8)),
    ('Mid-Value', 'Mid', dict(size=10, line=dict(width=1), color='lime', opacity=0.5)),
    ('High-Value', 'High', dict(size=11, line=dict(width=1), color='gold', opacity=0.9)),
]
plot_data = []
for segment, label, marker in segment_styles:
    rows = uk_graph.query(f"Segment == '{segment}'")
    plot_data.append(go.Scatter(x=rows['Frequency'], y=rows['Revenue'],
                                mode='markers', name=label, marker=marker))
plot_layout = go.Layout(
    yaxis= {'title': "Revenue"},
    xaxis= {'title': "Frequency"},
    title='Revenue :Frequency', width=600, height=400,
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Frequency vs Recency scatter, one trace per customer segment.
uk_graph = uk_user.query("Revenue < 50000 and Frequency < 2000")
configure_plotly_browser_state()
segment_styles = [
    ('Low-Value', 'Low', dict(size=7, line=dict(width=1), color='crimson', opacity=0.8)),
    ('Mid-Value', 'Mid', dict(size=9, line=dict(width=1), color='lime', opacity=0.5)),
    ('High-Value', 'High', dict(size=11, line=dict(width=1), color='gold', opacity=0.9)),
]
plot_data = []
for segment, label, marker in segment_styles:
    rows = uk_graph.query(f"Segment == '{segment}'")
    plot_data.append(go.Scatter(x=rows['Recency'], y=rows['Frequency'],
                                mode='markers', name=label, marker=marker))
plot_layout = go.Layout(
    # Fix: the axis titles were swapped relative to the plotted data
    # (x is Recency, y is Frequency).
    yaxis= {'title': "Frequency"},
    xaxis= {'title': "Recency"},
    title='Recency :Frequency', width=600, height=400,
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Revenue vs Recency scatter, one trace per customer segment.
configure_plotly_browser_state()
init_notebook_mode(connected= True)
uk_graph = uk_user.query("Revenue < 50000 and Frequency < 2000")
segment_styles = [
    ('Low-Value', 'Low', dict(size=9, line=dict(width=1), color='crimson', opacity=0.8)),
    ('Mid-Value', 'Mid', dict(size=10, line=dict(width=1), color='lime', opacity=0.5)),
    ('High-Value', 'High', dict(size=11, line=dict(width=1), color='gold', opacity=0.9)),
]
plot_data = []
for segment, label, marker in segment_styles:
    rows = uk_graph.query(f"Segment == '{segment}'")
    plot_data.append(go.Scatter(x=rows['Recency'], y=rows['Revenue'],
                                mode='markers', name=label, marker=marker))
plot_layout = go.Layout(
    yaxis= {'title': "Revenue"},
    # Fix: x plots Recency but the axis was labelled "Frequency".
    xaxis= {'title': "Recency"},
    title='Recency :Revenue', width=600, height=400,
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)